This R Markdown notebook documents the data cleaning analysis carried out on the dataset.

Add a new chunk by clicking the *Insert Chunk* button on the toolbar or by pressing *Ctrl+Alt+I*.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the *Preview* button or press *Ctrl+Shift+K* to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike *Knit*, *Preview* does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.

Data Cleaning

In carrying out data cleaning, a few key variables come to mind:

Uniqueness of Players

# Row counts of the league tables, in the order defense, shooting, passing,
# goalkeeping; the first three are expected to agree.
list(league_defense, league_shooting, league_passing, league_goalkeeping) %>%
  map_dbl(nrow)
[1] 5554 5554 5554  413
# The tournament tables, by contrast, do not all share the same row count.
# Naming the list elements labels each count with its table.
list(
  tournament_defense = tournament_defense,
  tournament_shooting = tournament_shooting,
  tournament_passing = tournament_passing,
  tournament_goalkeeping = tournament_goalkeeping
) %>%
  map_dbl(nrow)
    tournament_defense    tournament_shooting     tournament_passing 
                   488                   2015                    488 
tournament_goalkeeping 
                   129 

The next step is to ensure that the distinct players are the same across tables. We start with the league players: the check below confirms that the shooting, passing and defense tables contain identical lists of players.

# Reducer for reduce() that checks whether every element of a list is equal.
#
# Args:
#   out:   value carried over from the previous reduction step.
#   input: next element of the list being reduced.
# Returns:
#   `input` when it matches `out`; otherwise `done(FALSE)`, which stops the
#   reduction early so that reduce() yields FALSE.
check_list_equal <- function(out, input) {
  # `==` raises an error for data frames of different sizes and yields NA
  # where a value is missing. Neither case should crash the check: a size
  # mismatch means "not equal", and a value missing on both sides counts as
  # a match.
  is_same <- tryCatch(
    {
      both_missing <- is.na(out) & is.na(input)
      isTRUE(all(out == input | both_missing))
    },
    error = function(e) FALSE
  )
  if (!is_same) {
    return(done(FALSE))
  }
  input
}

# Demonstration with inconsistent input: the elements are not all equal, so
# the reduction stops early and yields FALSE.
reduce(list(2, 2, 3, 4), check_list_equal)
[1] FALSE
# Distinct players, sorted by Player, from each outfield league table. The
# reduction returns that shared table when all three agree (3429 consistent
# league players) and FALSE otherwise.
league_players <- list(league_defense, league_shooting, league_passing) %>%
  map(~ .x %>%
    distinct(Player) %>%
    arrange(Player)) %>%
  reduce(check_list_equal)
league_players

Tournament player data cleaning is slightly more complicated, mainly because the shooting and goalkeeping data cover both 2020 and 2021 while the other data sets only have 2021 data.

# Which years does each tournament table cover? The list names label the
# result.
list(
  tournament_defense = tournament_defense,
  tournament_shooting = tournament_shooting,
  tournament_passing = tournament_passing,
  tournament_goalkeeping = tournament_goalkeeping
) %>%
  map(function(db) unique(db[["Year"]]))
$tournament_defense
[1] 2021

$tournament_shooting
[1] 2020 2021

$tournament_passing
[1] 2021

$tournament_goalkeeping
[1] 2020 2021

First, check that the defense and passing tables have the same players, then check whether the shooting players are indeed a subset of that larger player pool.

# For tournament: defense and passing should list the same
# (Player, Year, Nation) combinations. The result is that shared table, or
# FALSE when the two tables disagree.
tournament_players <- list(tournament_defense, tournament_passing) %>%
  map(~ .x %>%
    distinct(Player, Year, Nation) %>%
    arrange(Player, Year, Nation)) %>%
  reduce(check_list_equal)

# Check that every value of `comparator_col` in `comparator` also occurs in
# `pool_col` of `big_pool`.
#
# Args:
#   big_pool:       data frame holding the reference pool.
#   comparator:     data frame whose values should be a subset of the pool.
#   pool_col:       name of the column to use from `big_pool`.
#   comparator_col: name of the column to use from `comparator`.
# Returns:
#   When every value is found, emits a message (its NULL value is returned
#   invisibly). Otherwise raises a warning, prints how many distinct values
#   were found, and returns the distinct comparator rows that are missing
#   from the pool.
check_player_pool <- function(big_pool, comparator,
                              pool_col = "Player", comparator_col = "Player") {
  # all_of() selects exactly the named column. A regex helper such as
  # matches() would also pick up look-alike columns (e.g. `Player_id`) and
  # inflate the distinct counts reported below.
  big_pool <- distinct(big_pool, across(all_of(pool_col)))
  comparator <- distinct(comparator, across(all_of(comparator_col)))

  # Membership is computed once and reused for the test, the count and the
  # row filter.
  in_pool <- comparator[[comparator_col]] %in% big_pool[[pool_col]]

  if (all(in_pool)) {
    message("All tournament shooting players are in the player pool.")
  } else {
    warning("Not all tournament shooting players are in the player pool.")
    print(
      paste(
        "Of the", nrow(comparator), "Players, only",
        sum(in_pool),
        "are in the bigger pool."
      )
    )
    comparator[!in_pool, ]
  }
}

# Are all 2021 tournament shooting players present in the tournament pool?
check_player_pool(
  big_pool = tournament_players,
  comparator = filter(tournament_shooting, Year == 2021)
)
All tournament shooting players are in the player pool.

Age Issue

A separate issue observed is that ages in the tournaments are not consistent:

# Does age increase over time?
# Confusingly, it decreases over time.

# Shooting: all rows of players that appear more than once.
tournament_shooting %>%
  group_by(Player) %>%
  filter(n() > 1) %>%
  arrange(Player, Year, Age) %>%
  select(Player, Year, Age)

# Shooting: one row per player with one age column per year, then a tally
# of the 2021 minus 2020 age difference.
tournament_shooting %>%
  group_by(Player) %>%
  filter(n() > 1) %>%
  arrange(Player, Year, Age) %>%
  pivot_wider(
    id_cols = Player,
    names_from = Year,
    values_from = Age,
    names_prefix = "year_"
  ) %>%
  group_by(diff = year_2021 - year_2020) %>%
  summarise(count = n())

# Goalkeeping: all rows of players that appear more than once.
tournament_goalkeeping %>%
  group_by(Player) %>%
  filter(n() > 1) %>%
  arrange(Player, Year, Age) %>%
  select(Player, Year, Age)

This is not the case for league data, however.

# For each league table: keep players whose records span at least one year
# (max(Year) - min(Year) >= 1), spread their ages into one column per year
# and tally the 2021 minus 2020 age difference.
list(league_defense, league_shooting, league_passing, league_goalkeeping) %>%
  map(function(db) {
    db %>%
      group_by(Player) %>%
      filter(max(Year) - min(Year) >= 1) %>%
      ungroup() %>%
      select(Player, Year, Age) %>%
      arrange(Player, Year, desc(Age)) %>%
      # NOTE(review): distinct() keeps every distinct (Player, Year, Age)
      # row, so a player with two ages in one year would still reach
      # pivot_wider() as a duplicate -- confirm that cannot happen.
      distinct() %>%
      pivot_wider(
        names_from = Year,
        values_from = Age,
        names_prefix = "year_"
      ) %>%
      group_by(diff = year_2021 - year_2020) %>%
      summarise(count = n())
  })
[[1]]

[[2]]

[[3]]

[[4]]
NA

Compare the league data with the tournament data:

# Same tally as for the league tables, applied to the two tournament tables
# that cover more than one year. The list names label the results.
list(
  tournament_shooting = tournament_shooting,
  tournament_goalkeeping = tournament_goalkeeping
) %>%
  map(function(db) {
    db %>%
      group_by(Player) %>%
      filter(max(Year) - min(Year) >= 1) %>%
      ungroup() %>%
      select(Player, Year, Age) %>%
      arrange(Player, Year, desc(Age)) %>%
      # NOTE(review): distinct() keeps every distinct (Player, Year, Age)
      # row, so a player with two ages in one year would still reach
      # pivot_wider() as a duplicate -- confirm that cannot happen.
      distinct() %>%
      pivot_wider(
        names_from = Year,
        values_from = Age,
        names_prefix = "year_"
      ) %>%
      group_by(diff = year_2021 - year_2020) %>%
      summarise(count = n())
  })
$tournament_shooting

$tournament_goalkeeping
NA

Tabulate the difference between the calculated age (year less birth year) and the recorded age.

# Tournament shooting: tally of (Year - Born) minus the recorded Age.
tournament_shooting %>%
  mutate(
    calc_ages = Year - Born,
    diff = calc_ages - Age
  ) %>%
  group_by(diff) %>%
  summarise(count = n())

# League shooting: the same tally.
league_shooting %>%
  mutate(
    calc_ages = Year - Born,
    diff = calc_ages - Age
  ) %>%
  group_by(diff) %>%
  summarise(count = n())

The assumption used here is that the age should be standardized to year less birth year. This ensures consistency between the data sets. League ages tend to underestimate the true age, while tournament ages tend to overestimate it.

# For tournament shooting players who also appear in `league_players`,
# compare the tournament age with the league (defense table) age recorded
# for the same player and year, and tally the difference.
tournament_shooting %>%
  select(Player, Year, age_tourny = Age) %>%
  filter(Player %in% league_players$Player) %>%
  inner_join(
    league_defense %>%
      arrange(Player, Year, Age) %>%
      distinct(Player, Year, Age),
    by = c("Player", "Year")
  ) %>%
  mutate(diff = age_tourny - Age) %>%
  group_by(diff) %>%
  summarise(count = n())
---
title: "R Notebook"
output: html_notebook
---
```{r, include=FALSE}
# Echo the source of every chunk in the rendered output.
knitr::opts_chunk$set(echo = TRUE)
# Run the project set-up script; presumably it loads the packages and the
# league_*/tournament_* tables used below -- TODO confirm.
source("00 set up.R")
```

This R Markdown notebook documents the data cleaning analysis carried out on the dataset.

```
Add a new chunk by clicking the *Insert Chunk* button on the toolbar or by pressing *Ctrl+Alt+I*.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the *Preview* button or press *Ctrl+Shift+K* to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike *Knit*, *Preview* does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.
```

## Data Cleaning
In carrying out data cleaning, a few key variables come to mind:

* Uniqueness of Players
* Age
* Consistency in nationality

### Uniqueness of Players
```{r, echo = T}
# Row counts of the league tables, in the order defense, shooting, passing,
# goalkeeping; the first three are expected to agree.
list(league_defense, league_shooting, league_passing, league_goalkeeping) %>%
  map_dbl(nrow)

# The tournament tables, by contrast, do not all share the same row count.
# Naming the list elements labels each count with its table.
list(
  tournament_defense = tournament_defense,
  tournament_shooting = tournament_shooting,
  tournament_passing = tournament_passing,
  tournament_goalkeeping = tournament_goalkeeping
) %>%
  map_dbl(nrow)
```

The next step is to ensure that the distinct players are the same across tables. We start with the league players: the check below confirms that the shooting, passing and defense tables contain identical lists of players.

```{r, echo = T}
# Reducer for reduce() that checks whether every element of a list is equal.
#
# Args:
#   out:   value carried over from the previous reduction step.
#   input: next element of the list being reduced.
# Returns:
#   `input` when it matches `out`; otherwise `done(FALSE)`, which stops the
#   reduction early so that reduce() yields FALSE.
check_list_equal <- function(out, input) {
  # `==` raises an error for data frames of different sizes and yields NA
  # where a value is missing. Neither case should crash the check: a size
  # mismatch means "not equal", and a value missing on both sides counts as
  # a match.
  is_same <- tryCatch(
    {
      both_missing <- is.na(out) & is.na(input)
      isTRUE(all(out == input | both_missing))
    },
    error = function(e) FALSE
  )
  if (!is_same) {
    return(done(FALSE))
  }
  input
}

# Demonstration with inconsistent input: the elements are not all equal, so
# the reduction stops early and yields FALSE.
reduce(list(2, 2, 3, 4), check_list_equal)

# Distinct players, sorted by Player, from each outfield league table. The
# reduction returns that shared table when all three agree (3429 consistent
# league players) and FALSE otherwise.
league_players <- list(league_defense, league_shooting, league_passing) %>%
  map(~ .x %>%
    distinct(Player) %>%
    arrange(Player)) %>%
  reduce(check_list_equal)
league_players
```

Tournament player data cleaning is slightly more complicated, mainly because the shooting and goalkeeping data cover both 2020 and 2021 while the other data sets only have 2021 data.

```{r}
# Which years does each tournament table cover? The list names label the
# result.
list(
  tournament_defense = tournament_defense,
  tournament_shooting = tournament_shooting,
  tournament_passing = tournament_passing,
  tournament_goalkeeping = tournament_goalkeeping
) %>%
  map(function(db) unique(db[["Year"]]))
```

First, check that the defense and passing tables have the same players, then check whether the shooting players are indeed a subset of that larger player pool.

```{r}
# For tournament: defense and passing should list the same
# (Player, Year, Nation) combinations. The result is that shared table, or
# FALSE when the two tables disagree.
tournament_players <- list(tournament_defense, tournament_passing) %>%
  map(~ .x %>%
    distinct(Player, Year, Nation) %>%
    arrange(Player, Year, Nation)) %>%
  reduce(check_list_equal)

# Check that every value of `comparator_col` in `comparator` also occurs in
# `pool_col` of `big_pool`.
#
# Args:
#   big_pool:       data frame holding the reference pool.
#   comparator:     data frame whose values should be a subset of the pool.
#   pool_col:       name of the column to use from `big_pool`.
#   comparator_col: name of the column to use from `comparator`.
# Returns:
#   When every value is found, emits a message (its NULL value is returned
#   invisibly). Otherwise raises a warning, prints how many distinct values
#   were found, and returns the distinct comparator rows that are missing
#   from the pool.
check_player_pool <- function(big_pool, comparator,
                              pool_col = "Player", comparator_col = "Player") {
  # all_of() selects exactly the named column. A regex helper such as
  # matches() would also pick up look-alike columns (e.g. `Player_id`) and
  # inflate the distinct counts reported below.
  big_pool <- distinct(big_pool, across(all_of(pool_col)))
  comparator <- distinct(comparator, across(all_of(comparator_col)))

  # Membership is computed once and reused for the test, the count and the
  # row filter.
  in_pool <- comparator[[comparator_col]] %in% big_pool[[pool_col]]

  if (all(in_pool)) {
    message("All tournament shooting players are in the player pool.")
  } else {
    warning("Not all tournament shooting players are in the player pool.")
    print(
      paste(
        "Of the", nrow(comparator), "Players, only",
        sum(in_pool),
        "are in the bigger pool."
      )
    )
    comparator[!in_pool, ]
  }
}

# Are all 2021 tournament shooting players present in the tournament pool?
check_player_pool(
  big_pool = tournament_players,
  comparator = filter(tournament_shooting, Year == 2021)
)

```

### Age Issue
A separate issue observed is that ages in the tournaments are not consistent:

```{r tournament_age_issues}
# Does age increase over time?
# Confusingly, it decreases over time.

# Shooting: all rows of players that appear more than once.
tournament_shooting %>%
  group_by(Player) %>%
  filter(n() > 1) %>%
  arrange(Player, Year, Age) %>%
  select(Player, Year, Age)

# Shooting: one row per player with one age column per year, then a tally
# of the 2021 minus 2020 age difference.
tournament_shooting %>%
  group_by(Player) %>%
  filter(n() > 1) %>%
  arrange(Player, Year, Age) %>%
  pivot_wider(
    id_cols = Player,
    names_from = Year,
    values_from = Age,
    names_prefix = "year_"
  ) %>%
  group_by(diff = year_2021 - year_2020) %>%
  summarise(count = n())

# Goalkeeping: all rows of players that appear more than once.
tournament_goalkeeping %>%
  group_by(Player) %>%
  filter(n() > 1) %>%
  arrange(Player, Year, Age) %>%
  select(Player, Year, Age)
```

This is not the case for league data, however.

```{r league_age_summary}
# For each league table: keep players whose records span at least one year
# (max(Year) - min(Year) >= 1), spread their ages into one column per year
# and tally the 2021 minus 2020 age difference.
list(league_defense, league_shooting, league_passing, league_goalkeeping) %>%
  map(function(db) {
    db %>%
      group_by(Player) %>%
      filter(max(Year) - min(Year) >= 1) %>%
      ungroup() %>%
      select(Player, Year, Age) %>%
      arrange(Player, Year, desc(Age)) %>%
      # NOTE(review): distinct() keeps every distinct (Player, Year, Age)
      # row, so a player with two ages in one year would still reach
      # pivot_wider() as a duplicate -- confirm that cannot happen.
      distinct() %>%
      pivot_wider(
        names_from = Year,
        values_from = Age,
        names_prefix = "year_"
      ) %>%
      group_by(diff = year_2021 - year_2020) %>%
      summarise(count = n())
  })
```

Compare the league data with the tournament data:

```{r tournament_age_summary}
# Same tally as for the league tables, applied to the two tournament tables
# that cover more than one year. The list names label the results.
list(
  tournament_shooting = tournament_shooting,
  tournament_goalkeeping = tournament_goalkeeping
) %>%
  map(function(db) {
    db %>%
      group_by(Player) %>%
      filter(max(Year) - min(Year) >= 1) %>%
      ungroup() %>%
      select(Player, Year, Age) %>%
      arrange(Player, Year, desc(Age)) %>%
      # NOTE(review): distinct() keeps every distinct (Player, Year, Age)
      # row, so a player with two ages in one year would still reach
      # pivot_wider() as a duplicate -- confirm that cannot happen.
      distinct() %>%
      pivot_wider(
        names_from = Year,
        values_from = Age,
        names_prefix = "year_"
      ) %>%
      group_by(diff = year_2021 - year_2020) %>%
      summarise(count = n())
  })
```

Tabulate the difference between the calculated age (year less birth year) and the recorded age.

```{r}
# Tournament shooting: tally of (Year - Born) minus the recorded Age.
tournament_shooting %>%
  mutate(
    calc_ages = Year - Born,
    diff = calc_ages - Age
  ) %>%
  group_by(diff) %>%
  summarise(count = n())

# League shooting: the same tally.
league_shooting %>%
  mutate(
    calc_ages = Year - Born,
    diff = calc_ages - Age
  ) %>%
  group_by(diff) %>%
  summarise(count = n())
```

The assumption used here is that the age should be standardized to year less birth year. This ensures consistency between the data sets. League ages tend to underestimate the true age, while tournament ages tend to overestimate it.

```{r}
# For tournament shooting players who also appear in `league_players`,
# compare the tournament age with the league (defense table) age recorded
# for the same player and year, and tally the difference.
tournament_shooting %>%
  select(Player, Year, age_tourny = Age) %>%
  filter(Player %in% league_players$Player) %>%
  inner_join(
    league_defense %>%
      arrange(Player, Year, Age) %>%
      distinct(Player, Year, Age),
    by = c("Player", "Year")
  ) %>%
  mutate(diff = age_tourny - Age) %>%
  group_by(diff) %>%
  summarise(count = n())
```

